/**
 *
 * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川,
 * yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.apdplat.superword.tools;

import org.apache.commons.lang.StringUtils;
import org.apdplat.superword.model.Word;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

/**
 * Tool for updating the word-list resource files.
 *
 * <p>Each {@code updateXxx()} method downloads the word pages of one or more
 * word classes from word.iciba.com and merges the words into the matching
 * text file under {@code src/main/resources}. The numeric arguments passed to
 * {@link #update(int, int, String)} are the site's {@code class} id and the
 * number of {@code course} pages to fetch for that class.
 *
 * @author 杨尚川
 */
public class WordsFetcher {

    private WordsFetcher(){}

    private static final Logger LOGGER = LoggerFactory.getLogger(WordsFetcher.class);

    /** CSS selector locating the word elements inside a word-list page. */
    private static final String WORD_CSS_PATH = "html body div#main_block div.word_box form#word_form div.word_main ul li div.word_main_list_w span";

    // HTTP request headers sent with every page request.
    private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    private static final String ENCODING = "gzip, deflate";
    private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    private static final String CONNECTION = "keep-alive";
    private static final String HOST = "www.iciba.com";
    private static final String REFERER = "http://www.iciba.com/";
    private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0";

    /** Directory (relative to the working directory) that holds the word files. */
    private static final String RESOURCE_DIR = "src/main/resources";

    /** Text contained in the page the site returns when it throttles the current IP. */
    private static final String BLOCKED_MARKER = "非常抱歉,来自您ip的请求异常频繁";

    /** How many times a page that came back blank is re-requested before giving up. */
    private static final int MAX_BLANK_RETRIES = 3;

    /**
     * Updates the primary-school word list.
     */
    public static void updatePrimarySchool(){
        // Primary school, Oxford edition
        update(63, 5, "/word_primary_school.txt");
        update(64, 3, "/word_primary_school.txt");
        update(65, 3, "/word_primary_school.txt");
        update(66, 3, "/word_primary_school.txt");
        update(67, 4, "/word_primary_school.txt");
        update(68, 4, "/word_primary_school.txt");
        update(69, 4, "/word_primary_school.txt");
        update(70, 5, "/word_primary_school.txt");
        update(71, 5, "/word_primary_school.txt");
        update(72, 4, "/word_primary_school.txt");
        update(73, 14, "/word_primary_school.txt");
        update(74, 10, "/word_primary_school.txt");
        // Primary school, Shenzhen edition
        update(655, 8, "/word_primary_school.txt");
        update(656, 8, "/word_primary_school.txt");
        update(657, 2, "/word_primary_school.txt");
        update(658, 3, "/word_primary_school.txt");
        update(149, 3, "/word_primary_school.txt");
        update(150, 4, "/word_primary_school.txt");
        update(151, 3, "/word_primary_school.txt");
        update(152, 4, "/word_primary_school.txt");
        update(154, 6, "/word_primary_school.txt");
        update(155, 8, "/word_primary_school.txt");
        update(156, 8, "/word_primary_school.txt");
        // Primary school English, Hebei edition
        update(265, 2, "/word_primary_school.txt");
        update(266, 3, "/word_primary_school.txt");
        update(267, 1, "/word_primary_school.txt");
        update(268, 2, "/word_primary_school.txt");
        update(269, 1, "/word_primary_school.txt");
        update(271, 2, "/word_primary_school.txt");
        update(272, 3, "/word_primary_school.txt");
    }

    /**
     * Updates the junior-high-school word list.
     */
    public static void updateJuniorSchool(){
        // Junior high, Oxford edition
        update(57, 27, "/word_junior_school.txt");
        update(58, 24, "/word_junior_school.txt");
        update(59, 21, "/word_junior_school.txt");
        update(60, 15, "/word_junior_school.txt");
        update(61, 20, "/word_junior_school.txt");
        update(62, 16, "/word_junior_school.txt");
        // Junior high, People's Education Press edition
        update(105, 30, "/word_junior_school.txt");
        update(106, 20, "/word_junior_school.txt");
        update(107, 28, "/word_junior_school.txt");
        update(108, 25, "/word_junior_school.txt");
        update(109, 37, "/word_junior_school.txt");
        // Ren'ai edition
        update(221, 19, "/word_junior_school.txt");
        update(222, 19, "/word_junior_school.txt");
        update(223, 18, "/word_junior_school.txt");
        update(224, 17, "/word_junior_school.txt");
        update(225, 12, "/word_junior_school.txt");
        update(226, 8, "/word_junior_school.txt");
        // Junior high, Hebei edition
        // NOTE(review): class ids 224 and 226 below also appear in the Ren'ai
        // list above with different page counts - possibly a typo; confirm
        // the intended ids against the site before changing them.
        update(273, 20, "/word_junior_school.txt");
        update(224, 18, "/word_junior_school.txt");
        update(226, 16, "/word_junior_school.txt");
        update(227, 16, "/word_junior_school.txt");
        update(228, 12, "/word_junior_school.txt");
        update(229, 14, "/word_junior_school.txt");
        // New junior high, People's Education Press edition
        update(728, 18, "/word_junior_school.txt");
        update(729, 25, "/word_junior_school.txt");
        // Yijiao edition
        update(678, 17, "/word_junior_school.txt");
    }

    /**
     * Updates the senior-high-school word list.
     */
    public static void updateSeniorSchool(){
        // Senior high, Oxford edition
        update(51, 19, "/word_senior_school.txt");
        update(52, 25, "/word_senior_school.txt");
        update(53, 24, "/word_senior_school.txt");
        update(54, 20, "/word_senior_school.txt");
        update(55, 25, "/word_senior_school.txt");
        update(56, 23, "/word_senior_school.txt");
        // Senior high, People's Education Press edition
        update(110, 14, "/word_senior_school.txt");
        update(111, 14, "/word_senior_school.txt");
        update(112, 19, "/word_senior_school.txt");
        update(113, 15, "/word_senior_school.txt");
        update(114, 18, "/word_senior_school.txt");
        update(118, 20, "/word_senior_school.txt");
        update(119, 19, "/word_senior_school.txt");
        // College entrance examination
        update(139, 5, "/word_senior_school.txt");
        update(140, 194, "/word_senior_school.txt");
    }

    /**
     * Updates the university word list.
     */
    public static void updateUniversity() {
        // College English intensive reading
        update(45, 27, "/word_university.txt");
        update(46, 37, "/word_university.txt");
        update(47, 40, "/word_university.txt");
        update(48, 46, "/word_university.txt");
        update(49, 25, "/word_university.txt");
        update(50, 65, "/word_university.txt");
    }

    /**
     * Updates the New Concept English word list.
     */
    public static void updateNewConception() {
        update(41, 41, "/word_new_conception.txt");
        update(42, 49, "/word_new_conception.txt");
        update(43, 81, "/word_new_conception.txt");
        update(44, 76, "/word_new_conception.txt");
    }

    /** Updates the CET4 word list. */
    public static void updateCET4(){
        update(11, 226, "/word_CET4.txt");
        update(122, 35, "/word_CET4.txt");
    }

    /** Updates the CET6 word list. */
    public static void updateCET6(){
        update(12, 105, "/word_CET6.txt");
        update(123, 25, "/word_CET6.txt");
    }

    /** Updates the KY word list. */
    public static void updateKY(){
        update(13, 274, "/word_KY.txt");
        update(143, 3, "/word_KY.txt");
    }

    /** Updates the TOEFL word list. */
    public static void updateTOEFL(){
        update(14, 245, "/word_TOEFL.txt");
    }

    /** Updates the IELTS word list. */
    public static void updateIELTS(){
        update(15, 228, "/word_IELTS.txt");
    }

    /** Updates the GRE word list. */
    public static void updateGRE(){
        update(16, 375, "/word_GRE.txt");
    }

    /** Updates the GMAT word list. */
    public static void updateGMAT(){
        update(36, 40, "/word_GMAT.txt");
        update(37, 54, "/word_GMAT.txt");
        update(38, 108, "/word_GMAT.txt");
    }

    /** Updates the TOEIC word list. */
    public static void updateTOEIC(){
        update(682, 42, "/word_TOEIC.txt");
    }

    /** Updates the SAT word list. */
    public static void updateSAT(){
        update(121, 11, "/word_SAT.txt");
    }

    /** Updates the BEC word list. */
    public static void updateBEC(){
        update(680, 47, "/word_BEC.txt");
        update(681, 10, "/word_BEC.txt");
    }

    /** Updates the ADULT word list. */
    public static void updateADULT(){
        update(703, 144, "/word_ADULT.txt");
        update(704, 284, "/word_ADULT.txt");
        update(705, 143, "/word_ADULT.txt");
        update(706, 11, "/word_ADULT.txt");
        update(707, 198, "/word_ADULT.txt");
        update(708, 171, "/word_ADULT.txt");
        update(709, 89, "/word_ADULT.txt");
        update(710, 61, "/word_ADULT.txt");
        update(711, 180, "/word_ADULT.txt");
    }

    /** Updates the MBA word list. */
    public static void updateMBA(){
        update(39, 243, "/word_MBA.txt");
    }

    /** Updates the TEM4 word list. */
    public static void updateTEM4(){
        update(90, 105, "/word_TEM4.txt");
    }

    /** Updates the TEM8 word list. */
    public static void updateTEM8(){
        update(91, 47, "/word_TEM8.txt");
    }

    /** Updates the CATTI word list. */
    public static void updateCATTI(){
        update(715, 70, "/word_CATTI.txt");
        update(716, 35, "/word_CATTI.txt");
        update(717, 94, "/word_CATTI.txt");
    }

    /**
     * Updates the list of common computer vocabulary.
     */
    public static void updateComputer() {
        update(78, 191, "/word_computer.txt");
    }

    /**
     * Updates the general word list ({@code /words.txt}) from miscellaneous
     * word classes.
     */
    public static void updateOther() {
        // Medicine
        update(75, 58, "/words.txt");
        update(76, 46, "/words.txt");
        update(77, 27, "/words.txt");
        // Finance
        update(79, 118, "/words.txt");
        // Making friends
        update(80, 18, "/words.txt");
        // Job hunting
        update(81, 11, "/words.txt");
        // Human resources
        update(97, 34, "/words.txt");
        // Human resources
        update(98, 14, "/words.txt");
        // Architecture
        update(147, 92, "/words.txt");
        // Chemistry (polymers)
        update(721, 17, "/words.txt");
        // Useful words
        update(712, 3, "/words.txt");
        update(713, 163, "/words.txt");
        // American English
        update(363, 29, "/words.txt");
        update(364, 25, "/words.txt");
        update(365, 46, "/words.txt");
        update(366, 50, "/words.txt");
        update(355, 31, "/words.txt");
        // Basic vocabulary
        update(362, 59, "/words.txt");
        // Collins and Oxford
        update(361, 54, "/words.txt");
        update(358, 55, "/words.txt");
        update(359, 33, "/words.txt");
        update(293, 49, "/words.txt");
        // Class 125 was previously requested twice in a row with identical
        // arguments; one request is enough because results are merged as a set.
        update(125, 24, "/words.txt");
        update(126, 42, "/words.txt");
        update(127, 60, "/words.txt");
        update(128, 109, "/words.txt");
        update(129, 212, "/words.txt");
        update(294, 53, "/words.txt");
        update(725, 122, "/words.txt");
        // Other
        update(720, 7, "/words.txt");
        update(726, 3, "/words.txt");
        update(676, 19, "/words.txt");
        update(175, 26, "/words.txt");
        update(144, 13, "/words.txt");
        update(145, 19, "/words.txt");
        update(146, 11, "/words.txt");
        update(99, 12, "/words.txt");
        update(87, 2, "/words.txt");
        update(83, 7, "/words.txt");
        update(84, 11, "/words.txt");
        update(85, 6, "/words.txt");
        update(86, 11, "/words.txt");
        update(153, 13, "/words.txt");
    }

    /**
     * Fetches the words of one word class and merges them into a word file.
     *
     * <p>The words obtained from {@code WordSources.get} for the target file
     * are combined with the freshly fetched ones, sorted, and written back
     * as lines of the form {@code <sequence number>\t<word>}, numbered from 1.
     * A failure to write the file is logged and not propagated.
     *
     * @param type       the site's {@code class} id of the word class
     * @param pageNumber number of {@code course} pages to fetch (pages 1..pageNumber)
     * @param file       file name starting with {@code /}, resolved below
     *                   {@code src/main/resources}
     */
    public static void update(int type, int pageNumber, String file){
        String path = RESOURCE_DIR + file;
        Set<Word> existWords = WordSources.get(path);
        Set<Word> words = fetch(type, pageNumber);
        LOGGER.debug("已经存在的词数:{}", existWords.size());
        LOGGER.debug("新获取到的词数:{}", words.size());
        words.addAll(existWords);
        LOGGER.debug("新旧合并之后的词数:{}", words.size());
        // The stream is sequential, so the counter yields 1, 2, 3... in sorted order.
        AtomicInteger sequence = new AtomicInteger();
        List<String> allWords = words
                .stream()
                .sorted()
                .map(w -> sequence.incrementAndGet() + "\t" + w.getWord())
                .collect(Collectors.toList());
        try{
            Files.write(Paths.get(path), allWords);
        }catch (Exception e){
            LOGGER.error("保存词汇失败", e);
        }
    }

    /**
     * Downloads and parses all pages of one word class.
     *
     * @param type       the site's {@code class} id of the word class
     * @param pageNumber number of {@code course} pages to fetch (pages 1..pageNumber)
     * @return the words parsed from all pages; empty if nothing could be fetched
     */
    public static Set<Word> fetch(int type, int pageNumber){
        Set<Word> words = new HashSet<>();
        String url = "http://word.iciba.com/?action=words&class="+type+"&course=";
        for (int page = 1; page <= pageNumber; page++){
            words.addAll(parse(fetchPage(url + page)));
        }
        LOGGER.debug("url:{},获取到的词数:{}", url, words.size());
        return words;
    }

    /**
     * Downloads one page, switching to a new IP address and retrying when the
     * response is blank or is the site's "requests too frequent" page.
     *
     * <p>Blank responses are retried at most {@link #MAX_BLANK_RETRIES} times
     * per page. The throttling page is retried until a different response
     * arrives, so this method does not return while the site keeps throttling.
     * Both conditions are checked after every request, so a blank response
     * received while recovering from throttling is retried as well instead
     * of being silently accepted.
     *
     * @param pageUrl full URL of the page
     * @return the page HTML, or a blank string if the page stayed blank
     */
    private static String fetchPage(String pageUrl){
        String html = getContent(pageUrl);
        int blankRetries = 0;
        while (true){
            if (StringUtils.isBlank(html)){
                if (blankRetries >= MAX_BLANK_RETRIES){
                    return html;
                }
                blankRetries++;
            } else if (!html.contains(BLOCKED_MARKER)){
                return html;
            }
            // Switch to a new IP address before asking again.
            DynamicIp.toNewIp();
            html = getContent(pageUrl);
        }
    }

    /**
     * Extracts the words from the HTML of a word-list page.
     *
     * <p>Only non-blank texts accepted by {@code WordSources.isEnglish} are
     * kept. Any exception raised while parsing is logged and the words
     * collected so far are returned.
     *
     * @param html page HTML
     * @return the words found in the page, possibly empty
     */
    public static Set<Word> parse(String html){
        Set<Word> words = new HashSet<>();
        try {
            for(Element element : Jsoup.parse(html).select(WORD_CSS_PATH)){
                String word = element.text().trim();
                if(StringUtils.isNotBlank(word) && WordSources.isEnglish(word)){
                    words.add(new Word(word, ""));
                    LOGGER.debug("解析出单词:{}", word);
                }
            }
        }catch (Exception e){
            LOGGER.error("解析单词出错", e);
        }
        return words;
    }

    /**
     * Requests a URL with browser-like headers and returns the response HTML.
     *
     * @param url the URL to request
     * @return the HTML with all CR and LF characters removed, or an empty
     *         string if the request failed (the failure is logged)
     */
    public static String getContent(String url) {
        LOGGER.debug("url:{}", url);
        Connection conn = Jsoup.connect(url)
                .header("Accept", ACCEPT)
                .header("Accept-Encoding", ENCODING)
                .header("Accept-Language", LANGUAGE)
                .header("Connection", CONNECTION)
                .header("Referer", REFERER)
                .header("Host", HOST)
                .header("User-Agent", USER_AGENT)
                .ignoreContentType(true);
        String html = "";
        try {
            html = conn.post().html();
            html = html.replaceAll("[\n\r]", "");
        }catch (Exception e){
            LOGGER.error("获取URL:"+url+"页面出错", e);
        }
        return html;
    }

    /**
     * Runs every update in sequence.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        updatePrimarySchool();
        updateJuniorSchool();
        updateSeniorSchool();
        updateUniversity();
        updateNewConception();
        updateCET4();
        updateCET6();
        updateKY();
        updateTOEFL();
        updateIELTS();
        updateGRE();
        updateGMAT();
        updateTOEIC();
        updateSAT();
        updateBEC();
        updateADULT();
        updateMBA();
        updateTEM4();
        updateTEM8();
        updateCATTI();
        updateComputer();
        updateOther();
    }
}